#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author:Pxz
# @Time  : 2019/1/21 5:44 PM

import os
import re

try:
    import jieba
except ImportError:
    # Install jieba on first use if it is not available.
    os.system('pip install jieba')
    import jieba


def get_default_file():
    # Locate the stopword file: data/stopwords.txt under the project root,
    # two directories above this file.
    d = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
    return os.path.join(d, 'data', 'stopwords.txt')


def stop_words():
    '''
    :return: de-duplicated, unordered list of stopwords
    '''
    stopwords = get_default_file()
    with open(stopwords, 'r', encoding='UTF-8') as f:
        # Strip whitespace and drop blank lines so the set holds clean entries.
        words = [line.strip() for line in f if line.strip()]
    return list(set(words))


def tokenizer(row):
    '''
    Tokenize a document, keeping only tokens made entirely of Chinese
    characters, Latin letters, or underscores.
    :param row: raw text of one document
    :return: list of filtered tokens
    '''
    result = list()
    regex = r'^[\u4e00-\u9fa5_a-zA-Z]+$'
    row_sp = ''.join(row.split())  # strip all whitespace before segmentation
    row_cuts = jieba.cut(row_sp)
    for row_cut in row_cuts:
        if len(row_cut) < 2:  # filter out tokens shorter than 2 characters
            continue
        # Keep tokens that are not stopwords, not bare line breaks, and match the pattern.
        if row_cut not in stops and row_cut not in ('\r', '\n') and re.match(regex, row_cut):
            result.append(row_cut)
    return result


# Module-level stopword set, loaded once at import time.
stops = stop_words()

if __name__ == "__main__":
    print(get_default_file())
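    # A minimal usage sketch: the sample sentence below is illustrative only,
    # and the output depends on the entries in data/stopwords.txt, which is
    # assumed to exist two directories above this file.
    sample = '我们今天一起学习自然语言处理技术'
    print(tokenizer(sample))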
